# Required Packages
import pandas as pd
import numpy as np
import pickle
import os
import json
from collections import defaultdict
# preprocessing
from sklearn.impute import SimpleImputer
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global matplotlib styling shared by every plot in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib 3.6 -- confirm before upgrading.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Jupyter magic: render figures inline (not valid outside a notebook).
%matplotlib inline
import warnings
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings("ignore")
![]()
In this article, we investigate a simulated dataset that mimics customer behavior on the Starbucks rewards mobile app. Starbucks tends to send out offers to users of the mobile app once every few days. These offers are exclusive; that is, not all users receive the same offer. An offer can contain a discount for their products or sometimes a BOGO (buy one, get one free) deal. These offers have a validity period before the offer expires. The article here is inspired by a towardsdatascience.com article.
The data is contained in three files:
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner followed by '=' padding up to width L.

    C picks the text's background color and the padding color; T picks the
    text (foreground) color. Colors are colorama names.
    """
    background = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                  'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                  'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    foreground = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = background[C] + foreground[T] + Style.NORMAL + Text + Style.RESET_ALL
    padding = foreground[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(banner + ' ' + padding)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in colorama color C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
               'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
               'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
# Discover the raw Starbucks JSON files and load each one into a global
# DataFrame named after the file (Portfolio, Profile, Transcript), collecting
# per-file shape information into Files_Info along the way.
Colors = ['Blue', 'Green', 'Red', 'Yellow', 'Black', 'Magenta', 'Cyan']
Path = 'StarBucks'
Files = os.listdir(Path)
Files = Search_List('.json', Files)
Files_Info = pd.DataFrame()
for i in range(len(Files)):
    # Read files
    filename = Files[i].split('.')[0].title()
    Header(filename, C = Colors[i])
    globals()[filename] = pd.read_json(os.path.join(Path, Files[i]), orient='records', lines=True)
    Temp = globals()[filename].shape
    # Bug fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat is the supported (and backward-compatible) equivalent.
    Files_Info = pd.concat([Files_Info,
                            pd.DataFrame({'File': [filename],
                                          'Number of Instances': [Temp[0]],
                                          'Number of Attributes': [Temp[1]]})],
                           ignore_index=True)
    Temp = globals()[filename].columns
    # Title-case the column names, e.g. 'offer_type' -> 'Offer Type'.
    globals()[filename].columns = [x.title().replace('Id', 'ID').replace('_', ' ') for x in Temp]
    # Display
    display(globals()[filename].head(5))
    Line()
# NOTE(review): Styler.hide_index() was removed in pandas 2.0; on modern
# pandas this should be Files_Info.style.hide(axis='index').
display(Files_Info.style.hide_index())
Line()
del Files, i, Temp, filename
Portfolio ==========================================================================================
| Reward | Channels | Difficulty | Duration | Offer Type | ID | |
|---|---|---|---|---|---|---|
| 0 | 10 | [email, mobile, social] | 10 | 7 | bogo | ae264e3637204a6fb9bb56bc8210ddfd |
| 1 | 10 | [web, email, mobile, social] | 10 | 5 | bogo | 4d5c57ea9a6940dd891ad53e9dbe8da0 |
| 2 | 0 | [web, email, mobile] | 0 | 4 | informational | 3f207df678b143eea3cee63160fa8bed |
| 3 | 5 | [web, email, mobile] | 5 | 7 | bogo | 9b98b8c7a33c4b65b9aebfe6a799e6d9 |
| 4 | 5 | [web, email] | 20 | 10 | discount | 0b1e1539f2cc45b7b9fa7c272da2e1d7 |
Profile ============================================================================================
| Gender | Age | ID | Became Member On | Income | |
|---|---|---|---|---|---|
| 0 | None | 118 | 68be06ca386d4c31939f3a4f0e3dd783 | 20170212 | NaN |
| 1 | F | 55 | 0610b486422d4921ae7d2bf64640c50b | 20170715 | 112000.0 |
| 2 | None | 118 | 38fe809add3b4fcf9315a9694bb96ff5 | 20180712 | NaN |
| 3 | F | 75 | 78afa995795e4d85b5d9ceeca43f5fef | 20170509 | 100000.0 |
| 4 | None | 118 | a03223e636434f42ac4c3df47e8bac43 | 20170804 | NaN |
Transcript =========================================================================================
| Person | Event | Value | Time | |
|---|---|---|---|---|
| 0 | 78afa995795e4d85b5d9ceeca43f5fef | offer received | {'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'} | 0 |
| 1 | a03223e636434f42ac4c3df47e8bac43 | offer received | {'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'} | 0 |
| 2 | e2127556f4f64592b11af22de27a7932 | offer received | {'offer id': '2906b810c7d4411798c6938adc9daaa5'} | 0 |
| 3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | offer received | {'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'} | 0 |
| 4 | 68617ca6246f4fbc85e91a2a49552598 | offer received | {'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'} | 0 |
====================================================================================================
| File | Number of Instances | Number of Attributes |
|---|---|---|
| Portfolio | 10 | 6 |
| Profile | 17000 | 5 |
| Transcript | 306534 | 4 |
====================================================================================================
| Feature | Description |
|---|---|
| Reward (int) | Given reward for completing an offer |
| Channels (list of strings) | Email, mobile app, social media, etc |
| Difficulty (int) | Minimum spending requirement for completing an offer |
| Duration (int) | Time that an offer is valid |
| Offer Type (string) | Type of offer |
| ID (string) | Offer ID |
| Feature | Description |
|---|---|
| Gender (str) | Customer's gender |
| Age (int) | Customer's age |
| ID (str) | Customer's ID |
| Became Member On (int) | Date of membership |
| Income (float) | Customer's income |
| Feature | Description |
|---|---|
| Person (str) | Customer ID |
| Event (str) | Record description |
| time (int) | Time in hours (since the beginning of the study) |
| Value - (dict of strings) | Offer ID or transaction amount |
def Data_Plot(Inp, Title, W = None):
    """Plot the percentage of non-null values per column, colored by dtype.

    Parameters
    ----------
    Inp : pandas.DataFrame to summarise.
    Title : str, chart title (rendered bold).
    W : int or None, optional fixed figure width in pixels.

    Returns
    -------
    pandas.DataFrame with one row per feature: dtype, NaN count, total size
    and the percentage of non-null entries.
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of non-null values per column, rounded to 2 decimals.
    data_info['Percentage'] = 100 - np.round(100 * (data_info['Number of NaN Values'] / Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop=False).rename(columns={'index': 'Features'})
    #
    fig = px.bar(data_info, x='Features', y='Percentage', color='Data Type',
                 text='Percentage',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=data_info.columns)
    fig.update_layout(plot_bgcolor='white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                        bordercolor="DarkGray", borderwidth=1))
    # Bug fix: `if not W == None` replaced with the correct identity check.
    if W is not None:
        fig.update_layout(width=W)
    # NOTE(review): the plotly placeholder is '%{text}'; the extra leading '%'
    # here renders a literal percent sign before the value -- confirm intent.
    fig.update_traces(texttemplate=10 * ' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    # Bug fix: the closing bold tag was '<b>' instead of '</b>'.
    fig.update_layout(title={'text': '<b>' + Title + '</b>', 'x': 0.5,
                             'y': 0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
def dtypes_group(Inp, Dict = False):
    """Summarise the columns of Inp grouped by their dtype.

    Returns a DataFrame with columns ['Data Type', 'Features', 'Count'],
    or a {dtype-string: list-of-columns} dict when Dict is True.
    """
    by_type = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = pd.DataFrame(index=by_type['Data Type'].unique(), columns=['Features', 'Count'])
    for dtype in by_type['Data Type'].unique():
        # Columns whose dtype equals `dtype`, in original column order.
        members = by_type.index[by_type['Data Type'] == dtype].tolist()
        Out.loc[Out.index == dtype, 'Features'] = [members]
        Out.loc[Out.index == dtype, 'Count'] = len(members)
    Out.index.name = 'Data Type'
    Out = Out.reset_index(drop=False)
    Out['Data Type'] = Out['Data Type'].astype(str)
    return dict(zip(Out['Data Type'], Out['Features'])) if Dict else Out
# One-hot encode the offer channels and tidy up the Portfolio table.
Temp = Portfolio['Channels'].str.join(',').map(lambda x: x.title()).str.get_dummies(',')
# Feat_Dict maps short feature names to human-readable descriptions;
# it is extended throughout the notebook.
Feat_Dict = dict(zip(Temp.columns, ['Channels' + ': ' + x for x in Temp.columns]))
Portfolio = pd.concat([Portfolio, Temp], axis=1)
del Temp
Portfolio = Portfolio.drop(['Channels'], axis=1)
# 'bogo' -> 'BOGO'; every other offer type is simply title-cased.
Portfolio['Offer Type'] = Portfolio['Offer Type'].map(lambda x: x.upper() if x == 'bogo' else x.title())
Portfolio.rename(columns={'ID': 'Offer ID'}, inplace=True)
display(Portfolio.style.background_gradient(subset=['Reward'], cmap='Greens')
        .background_gradient(subset=['Difficulty'], cmap='Oranges'))
# Bug fix: title typo 'Dataste' -> 'Dataset'.
_ = Data_Plot(Portfolio, 'Portfolio Dataset', W=550)
| Reward | Difficulty | Duration | Offer Type | Offer ID | Mobile | Social | Web | ||
|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 10 | 7 | BOGO | ae264e3637204a6fb9bb56bc8210ddfd | 1 | 1 | 1 | 0 |
| 1 | 10 | 10 | 5 | BOGO | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 1 | 1 | 1 | 1 |
| 2 | 0 | 0 | 4 | Informational | 3f207df678b143eea3cee63160fa8bed | 1 | 1 | 0 | 1 |
| 3 | 5 | 5 | 7 | BOGO | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 1 | 1 | 0 | 1 |
| 4 | 5 | 20 | 10 | Discount | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 1 | 0 | 0 | 1 |
| 5 | 3 | 7 | 7 | Discount | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 1 | 1 | 1 | 1 |
| 6 | 2 | 10 | 10 | Discount | fafdcd668e3743c1bb461111dcafc2a4 | 1 | 1 | 1 | 1 |
| 7 | 0 | 0 | 3 | Informational | 5a8bc65990b245e5a138643cd4eb9837 | 1 | 1 | 1 | 0 |
| 8 | 5 | 5 | 5 | BOGO | f19421c1d4aa40978ebb69ca19b0e20d | 1 | 1 | 1 | 1 |
| 9 | 2 | 10 | 7 | Discount | 2906b810c7d4411798c6938adc9daaa5 | 1 | 1 | 0 | 1 |
# Bug fix: title typo 'Dataste' -> 'Dataset'.
_ = Data_Plot(Profile, 'Profile Dataset', W=450)
# Kernel-density plot of customer ages (before cleaning the 118-year spike).
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 5))
_ = sns.kdeplot(Profile['Age'], shade=True, color='HotPink', ax=ax, legend=False)
_ = ax.set_xlim([0, 140])
_ = ax.set_ylim([0, .025])
# Bug fix: this call was accidentally duplicated on two consecutive lines.
_ = ax.set_yticklabels(ax.get_yticks(), size=13)
_ = ax.set_title('Age Kernel Density', fontsize=14)
One possible reason behind having ages over 100 could be misentering age during user registration. To address this issue, we can search for ages over 100 and impute them with median ages.
# Ages of 100+ are treated as registration errors: blank them out and
# impute the gaps with the median age, then store the column as integers.
Profile.loc[Profile['Age'] >= 100, 'Age'] = np.nan
age_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
Profile['Age'] = age_imputer.fit_transform(Profile['Age'].values.reshape(-1, 1))
Profile['Age'] = Profile['Age'].astype('int32')
del age_imputer
# Re-plot the age distribution after imputation to confirm the cleanup.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 5))
_ = sns.kdeplot(Profile['Age'], shade=True, color='LimeGreen', ax=ax, legend=False)
_ = ax.set_xlim([0, 120])
_ = ax.set_ylim([0, .05])
# Bug fix: this call was accidentally duplicated on two consecutive lines.
_ = ax.set_yticklabels(ax.get_yticks(), size=13)
_ = ax.set_title('Age Kernel Density', fontsize=14)
# Convert the integer YYYYMMDD 'Became Member On' column to real datetimes.
Profile['Became Member On'] = pd.to_datetime(Profile['Became Member On'], format='%Y%m%d')
# Keep just the membership year in a new 'Member Since Year' column.
Profile['Member Since Year'] = Profile['Became Member On'].dt.year
Note that
Profile.sort_values(by='Became Member On', ascending=True).tail()
| Gender | Age | ID | Became Member On | Income | Member Since Year | |
|---|---|---|---|---|---|---|
| 8079 | M | 82 | 039e0c539c2f4fc582423412a8bceb40 | 2018-07-26 | 32000.0 | 2018 |
| 12326 | F | 59 | 3b9ec536e4224688950913b9cf53362c | 2018-07-26 | 88000.0 | 2018 |
| 10282 | M | 46 | e4989ca3d8974e28a3fe87aa0b0ce327 | 2018-07-26 | 74000.0 | 2018 |
| 12902 | M | 85 | f1f01be4e2344bf08b7f488c115ad374 | 2018-07-26 | 45000.0 | 2018 |
| 4357 | M | 76 | 6597e5ab63754726b83ba2032f008d26 | 2018-07-26 | 92000.0 | 2018 |
Assuming the data is a snapshot of the end of 2018. get member tenure by the number of months.
\begin{align} \text{Member Tenure} = \text{The End of 2018} - \text{Became Member On} \qquad (\text{In Months}) \end{align}
Temp = pd.to_datetime('20181231', format='%Y%m%d')
# Tenure in whole months from sign-up to the snapshot date (Temp, end of 2018).
# NOTE(review): np.timedelta64(1, 'M') is an average-month unit; this division
# on datetimes is deprecated in recent numpy -- confirm before upgrading.
Profile['Member Tenure'] = round((Temp - Profile['Became Member On'])/np.timedelta64(1, 'M')).astype('int32')
del Temp
# Expand the gender codes; missing genders are bucketed into 'Other'.
Profile['Gender'] = Profile['Gender'].replace({'F':'Female', 'M': 'Male', 'O': 'Other'}).fillna('Other')
Imputing NaN values.
# Fill the missing incomes with the median income, then store as integers.
income_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
Profile['Income'] = income_imputer.fit_transform(Profile['Income'].values.reshape(-1, 1))
Profile['Income'] = Profile['Income'].astype(int)
del income_imputer
Thus,
# Preview the cleaned Profile table with gradient-highlighted columns.
# NOTE(review): Styler.set_precision was removed in pandas 2.0; on modern
# pandas use .format(precision=2) instead.
display(Profile.head(10).style.set_precision(2).background_gradient(subset=['Income'], cmap='Greens')
        .background_gradient(subset=['Member Tenure'], cmap='Oranges'))
# Bug fix: title typo 'Dataste' -> 'Dataset'.
_ = Data_Plot(Profile, 'Profile Dataset', W=550)
| Gender | Age | ID | Became Member On | Income | Member Since Year | Member Tenure | |
|---|---|---|---|---|---|---|---|
| 0 | Other | 55 | 68be06ca386d4c31939f3a4f0e3dd783 | 2017-02-12 00:00:00 | 64000 | 2017 | 23 |
| 1 | Female | 55 | 0610b486422d4921ae7d2bf64640c50b | 2017-07-15 00:00:00 | 112000 | 2017 | 18 |
| 2 | Other | 55 | 38fe809add3b4fcf9315a9694bb96ff5 | 2018-07-12 00:00:00 | 64000 | 2018 | 6 |
| 3 | Female | 75 | 78afa995795e4d85b5d9ceeca43f5fef | 2017-05-09 00:00:00 | 100000 | 2017 | 20 |
| 4 | Other | 55 | a03223e636434f42ac4c3df47e8bac43 | 2017-08-04 00:00:00 | 64000 | 2017 | 17 |
| 5 | Male | 68 | e2127556f4f64592b11af22de27a7932 | 2018-04-26 00:00:00 | 70000 | 2018 | 8 |
| 6 | Other | 55 | 8ec6ce2a7e7949b1bf142def7d0e0586 | 2017-09-25 00:00:00 | 64000 | 2017 | 15 |
| 7 | Other | 55 | 68617ca6246f4fbc85e91a2a49552598 | 2017-10-02 00:00:00 | 64000 | 2017 | 15 |
| 8 | Male | 65 | 389bc3fa690240e798340f5a15918d5c | 2018-02-09 00:00:00 | 53000 | 2018 | 11 |
| 9 | Other | 55 | 8974fc5686fe429db53ddde067b88302 | 2016-11-22 00:00:00 | 64000 | 2016 | 25 |
Looking at the Value column from the Transcript data, we notice that this data needs to be processed before it can be used in our analysis. Thus, we search for the different key-value pairs. That can be done as follows
# Collect the distinct key-sets that appear in the Transcript 'Value' dicts,
# showing a progress bar while scanning the ~300k rows.
value_keys = []
Counter = 0
Progress_Bar = progressbar.ProgressBar(maxval=Transcript.shape[0],
                                       widgets=[progressbar.Bar('#', '|', '|'), progressbar.Percentage()])
# The Loop
Progress_Bar.start()
for i in range(Transcript.shape[0]):
    Counter += 1
    # Bug fix: the bar was never advanced inside the loop, so it jumped
    # straight from 0% to 100% at finish().
    Progress_Bar.update(Counter)
    if Transcript.Value[i].keys() not in value_keys:
        value_keys.append(Transcript.Value[i].keys())
Progress_Bar.finish()
|#########################################################################|100%
Now, we divide offer id into individual columns
# Expand each 'Value' dict into its own columns (union of all keys; rows
# missing a key get NaN). Building a DataFrame from the list of dicts is far
# faster than Transcript['Value'].apply(pd.Series), which constructs one
# Series object per row.
Temp = pd.DataFrame(Transcript['Value'].tolist(), index=Transcript.index)
Transcript = pd.concat([Transcript, Temp], axis=1)
del Temp
Let's look at the resulting DataFrame.
# Highlight the two redundant offer-id columns produced by the expansion.
display(
    Transcript.head(8)
    .style
    .set_properties(subset=['offer id'], **{'background-color': 'DarkRed', 'color': 'White'})
    .set_properties(subset=['offer_id'], **{'background-color': 'Indigo', 'color': 'White'})
)
| Person | Event | Value | Time | offer id | amount | offer_id | reward | |
|---|---|---|---|---|---|---|---|---|
| 0 | 78afa995795e4d85b5d9ceeca43f5fef | offer received | {'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'} | 0 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | nan | nan | nan |
| 1 | a03223e636434f42ac4c3df47e8bac43 | offer received | {'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'} | 0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | nan | nan | nan |
| 2 | e2127556f4f64592b11af22de27a7932 | offer received | {'offer id': '2906b810c7d4411798c6938adc9daaa5'} | 0 | 2906b810c7d4411798c6938adc9daaa5 | nan | nan | nan |
| 3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | offer received | {'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'} | 0 | fafdcd668e3743c1bb461111dcafc2a4 | nan | nan | nan |
| 4 | 68617ca6246f4fbc85e91a2a49552598 | offer received | {'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'} | 0 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | nan | nan | nan |
| 5 | 389bc3fa690240e798340f5a15918d5c | offer received | {'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'} | 0 | f19421c1d4aa40978ebb69ca19b0e20d | nan | nan | nan |
| 6 | c4863c7985cf408faee930f111475da3 | offer received | {'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'} | 0 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | nan | nan | nan |
| 7 | 2eeac8d8feae4a8cad5a6af0499a211d | offer received | {'offer id': '3f207df678b143eea3cee63160fa8bed'} | 0 | 3f207df678b143eea3cee63160fa8bed | nan | nan | nan |
Note that Offer ID appears in two separate columns since there are two different key types for Offer ID. We would like to create a new column to combine the two Offer ID columns.
# Coalesce the two offer-id columns: take 'offer id' where present,
# otherwise fall back to 'offer_id'.
Transcript['Temp'] = Transcript['offer id'].combine_first(Transcript['offer_id'])
Transcript = Transcript.drop(['offer id', 'offer_id'], axis=1)
Transcript = Transcript.rename(columns={'Temp': 'Offer ID', 'amount': 'Amount', 'reward': 'Reward'})
# Title-case the event labels, e.g. 'offer received' -> 'Offer Received'.
Transcript['Event'] = Transcript['Event'].apply(lambda x: x.title())
display(Transcript.head(8))
| Person | Event | Value | Time | Amount | Reward | Offer ID | |
|---|---|---|---|---|---|---|---|
| 0 | 78afa995795e4d85b5d9ceeca43f5fef | Offer Received | {'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'} | 0 | NaN | NaN | 9b98b8c7a33c4b65b9aebfe6a799e6d9 |
| 1 | a03223e636434f42ac4c3df47e8bac43 | Offer Received | {'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'} | 0 | NaN | NaN | 0b1e1539f2cc45b7b9fa7c272da2e1d7 |
| 2 | e2127556f4f64592b11af22de27a7932 | Offer Received | {'offer id': '2906b810c7d4411798c6938adc9daaa5'} | 0 | NaN | NaN | 2906b810c7d4411798c6938adc9daaa5 |
| 3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | Offer Received | {'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'} | 0 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 |
| 4 | 68617ca6246f4fbc85e91a2a49552598 | Offer Received | {'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'} | 0 | NaN | NaN | 4d5c57ea9a6940dd891ad53e9dbe8da0 |
| 5 | 389bc3fa690240e798340f5a15918d5c | Offer Received | {'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'} | 0 | NaN | NaN | f19421c1d4aa40978ebb69ca19b0e20d |
| 6 | c4863c7985cf408faee930f111475da3 | Offer Received | {'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'} | 0 | NaN | NaN | 2298d6c36e964ae4a3e7e9706d1fb8c2 |
| 7 | 2eeac8d8feae4a8cad5a6af0499a211d | Offer Received | {'offer id': '3f207df678b143eea3cee63160fa8bed'} | 0 | NaN | NaN | 3f207df678b143eea3cee63160fa8bed |
Merging Transcript with Portfolio to get offer type.
Data = Transcript.merge(Portfolio, how='left',on='Offer ID')
However, this creates some undesired columns that we need to modify
# Drop the raw 'Value' dicts (already expanded into columns).
Data = Data.drop(columns=['Value'])
# Disambiguate the merge-suffixed reward columns: Reward_x came from the
# transcript (received), Reward_y from the portfolio (defined).
Data = Data.rename(columns={'Reward_x': 'Reward Received', 'Reward_y': 'Reward Defined'})
Therefore,
Data.head(8)
| Person | Event | Time | Amount | Reward Received | Offer ID | Reward Defined | Difficulty | Duration | Offer Type | Mobile | Social | Web | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 78afa995795e4d85b5d9ceeca43f5fef | Offer Received | 0 | NaN | NaN | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 5.0 | 5.0 | 7.0 | BOGO | 1.0 | 1.0 | 0.0 | 1.0 |
| 1 | a03223e636434f42ac4c3df47e8bac43 | Offer Received | 0 | NaN | NaN | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 5.0 | 20.0 | 10.0 | Discount | 1.0 | 0.0 | 0.0 | 1.0 |
| 2 | e2127556f4f64592b11af22de27a7932 | Offer Received | 0 | NaN | NaN | 2906b810c7d4411798c6938adc9daaa5 | 2.0 | 10.0 | 7.0 | Discount | 1.0 | 1.0 | 0.0 | 1.0 |
| 3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | Offer Received | 0 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 4 | 68617ca6246f4fbc85e91a2a49552598 | Offer Received | 0 | NaN | NaN | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 10.0 | 10.0 | 5.0 | BOGO | 1.0 | 1.0 | 1.0 | 1.0 |
| 5 | 389bc3fa690240e798340f5a15918d5c | Offer Received | 0 | NaN | NaN | f19421c1d4aa40978ebb69ca19b0e20d | 5.0 | 5.0 | 5.0 | BOGO | 1.0 | 1.0 | 1.0 | 1.0 |
| 6 | c4863c7985cf408faee930f111475da3 | Offer Received | 0 | NaN | NaN | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 3.0 | 7.0 | 7.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 7 | 2eeac8d8feae4a8cad5a6af0499a211d | Offer Received | 0 | NaN | NaN | 3f207df678b143eea3cee63160fa8bed | 0.0 | 0.0 | 4.0 | Informational | 1.0 | 1.0 | 0.0 | 1.0 |
# Persist the cleaned tables. Bug fix: the previous Path + '\Name.csv'
# concatenation embedded a literal backslash in the filename on non-Windows
# systems ('\P' is not an escape sequence); os.path.join is portable.
Portfolio.to_csv(os.path.join(Path, 'Portfolio_Clean.csv'), index=None, header=True)
Profile.to_csv(os.path.join(Path, 'Profile_Clean.csv'), index=None, header=True)
Transcript.to_csv(os.path.join(Path, 'Transcript_Clean.csv'), index=None, header=True)
Data.to_csv(os.path.join(Path, 'Data.csv'), index=None, header=True)
An offer is considered effective if a customer receives an offer, then views the received offer, and finally completes a transaction — either by completing a BOGO or Discount offer, or by making a transaction under an Informational offer — within the period the offer is valid.
Moreover, an offer that is not classified as effective is considered here an ineffective offer.
Consider the following diagram:
Effective Offers:
Ineffective Offers:
The object of the exercise here is to predict the most suitable offer type that is needed to be sent out to each customer. In doing so, we need to combine the dataset to build a general database that provides a summary of each user's transactional behaviors.
# Sankey diagram sketching which event paths count as effective offers.
source = [0, 0, 1, 3, 4, 5, 5, 2, 8, 9, 9, 8, 12, 13, 14, 2, 16]
target = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Labels = ['Offers', 'Effective Offers', 'Ineffective Offers', 'Offer Received', 'Offer Viewed', 'Transaction',
          'Offer Completed (BOGO and Discount offers)', 'Offer Completed (Informational Offers)',
          'Offer Received', 'No action happened', 'No transaction happened', 'Offer not Viewed',
          'Offer not Viewed', 'Offer not Viewed', 'Transaction', 'Offer viewed', 'Offer not received',
          'Transaction', 'Offer not received']
# Bug fix: one link value per (source, target) pair -- size the value list to
# len(source) (number of links), not len(Labels) (number of node labels).
fig = go.Figure(go.Sankey(link=dict(source=source, target=target,
                                    value=[0.1] * len(source)),
                          node=dict(label=Labels, pad=15, thickness=5)))
# Bug fix: closing bold tag was '<b>' instead of '</b>'.
fig.update_layout(title={'text': '<b>' + 'Effective Offers vs. Ineffective Offers' + '</b>',
                         'x': 0.5, 'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Let's introduce some new features.
| New Features | Full Features name | Description |
|---|---|---|
| BOGO Offer Rec | BOGO Offer Received | The number of BOGO offer received |
| Disc Offer Rec | Discount Offer Received | The number of Discount offer received |
| Info Offer Rec | Informational Offer Received | The number of Informational offer received |
# Register readable names for the new offer-count features.
Feat_Dict.update({'BOGO Offer Rec': 'BOGO Offer Received',
                  'Disc Offer Rec': 'Discount Offer Received',
                  'Info Offer Rec': 'Informational Offer Received'})
# Count, per customer and offer type, how many offers were received.
received = Data[Data['Event'] == 'Offer Received']
received_counts = (received.groupby(['Person', 'Offer Type'])['Offer Type']
                   .agg('count').unstack(level=-1))
# Creating a user data dataframe indexed by profile IDs, then attaching
# the per-type receive counts.
User_Data = pd.DataFrame(index=Profile['ID']).join(received_counts)
del received, received_counts
# Rename to the short feature names registered in Feat_Dict.
User_Data = User_Data.rename(columns={'BOGO': 'BOGO Offer Rec',
                                      'Discount': 'Disc Offer Rec',
                                      'Informational': 'Info Offer Rec'})
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | |
|---|---|---|---|
| ID | |||
| 68be06ca386d4c31939f3a4f0e3dd783 | NaN | 5.0 | NaN |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | NaN | 1.0 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 |
As can be seen, not all customers received all offers. Let's investigate this a bit further. Since we have three columns (one for each offer type), if a row has more than two NaN values, then the customer hasn't received any offers!
# Count NaNs per row: a row with all three offer columns missing (> 2 NaNs)
# belongs to a customer who never received any offer.
No_Offer = User_Data.isnull().sum(axis=1)
never_received = No_Offer > 2
No_Offer_List = list(No_Offer[never_received].index)
# Print
print(Fore.BLUE + Style.NORMAL + 'The number of customers that never received any offers' + Style.RESET_ALL
      + ' = %i' % No_Offer[never_received].count())
The number of customers that never received any offers = 6
We can test the results. For example, for the first user from No_Offer_List, we have
# Sanity check: this customer's records contain only plain transactions,
# no offer events -- consistent with never receiving an offer.
display(Data[Data['Person'] == No_Offer_List[0]])
del No_Offer, No_Offer_List
| Person | Event | Time | Amount | Reward Received | Offer ID | Reward Defined | Difficulty | Duration | Offer Type | Mobile | Social | Web | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 69477 | c6e579c6821c41d1a7a6a9cf936e91bb | Transaction | 174 | 0.65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 87239 | c6e579c6821c41d1a7a6a9cf936e91bb | Transaction | 222 | 1.91 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 163610 | c6e579c6821c41d1a7a6a9cf936e91bb | Transaction | 408 | 1.25 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 243317 | c6e579c6821c41d1a7a6a9cf936e91bb | Transaction | 570 | 3.14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Adding more features to User_Data dataframe, we can introduce new parameters.
| New Features | Full Features name | Description |
|---|---|---|
| Tot Tran Cnt | Total Transaction Count | Whether or not an offer incentivized a transaction |
| Tot Tran Amnt | Total Transaction Amount | Total transaction amount |
| Tot Rewards Rec | Total Rewards Received | The number of received rewards from an offer type |
| Ave Tran Amnt | Average Transaction Amount | Total transaction amount (TTA) / Total transaction count (TTC) |
# Register readable names for the transaction-summary features.
Feat_Dict.update({'Tot Tran Cnt': 'Total Transaction Count',
                  'Tot Tran Amnt': 'Total Transaction Amount',
                  'Tot Rewards Rec': 'Total Rewards Received',
                  'Ave Tran Amnt': 'Average Transaction Amount'})
# Count each customer's transactions and attach as 'Tot Tran Cnt'.
transactions = Data[Data['Event'] == 'Transaction']
per_person = transactions.groupby('Person')['Event'].count().to_frame('Tot Tran Cnt')
User_Data = User_Data.join(per_person)
del transactions, per_person
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | |
|---|---|---|---|---|
| ID | ||||
| 68be06ca386d4c31939f3a4f0e3dd783 | NaN | 5.0 | NaN | 9.0 |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 | 3.0 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 | 6.0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | NaN | 1.0 | 7.0 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 | 3.0 |
# Sum each customer's transaction amounts and attach as 'Tot Tran Amnt'.
transactions = Data[Data['Event'] == 'Transaction']
spend = transactions.groupby('Person')['Amount'].sum().to_frame('Tot Tran Amnt')
User_Data = User_Data.join(spend)
del transactions, spend
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | |
|---|---|---|---|---|---|
| ID | |||||
| 68be06ca386d4c31939f3a4f0e3dd783 | NaN | 5.0 | NaN | 9.0 | 20.40 |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 | 3.0 | 77.01 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 | 6.0 | 14.30 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | NaN | 1.0 | 7.0 | 159.27 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 | 3.0 | 4.65 |
# Total rewards actually received per customer, attached as 'Tot Rewards Rec'.
rewards = Data.groupby('Person')['Reward Received'].sum().to_frame('Tot Rewards Rec')
User_Data = User_Data.join(rewards)
del rewards
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | Tot Rewards Rec | |
|---|---|---|---|---|---|---|
| ID | ||||||
| 68be06ca386d4c31939f3a4f0e3dd783 | NaN | 5.0 | NaN | 9.0 | 20.40 | 5.0 |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 | 3.0 | 77.01 | 5.0 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 | 6.0 | 14.30 | 0.0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | NaN | 1.0 | 7.0 | 159.27 | 20.0 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 | 3.0 | 4.65 | 0.0 |
User_Data['Ave Tran Amnt'] = round(User_Data['Tot Tran Amnt']/User_Data['Tot Tran Cnt'],3)
Now we add more parameters to the User Data dataframe.
| New Features | Full Features name | Description |
|---|---|---|
| Tran Comp NO off | Transactions Count without any Offers | The number of transactions for a user completes without an offer |
| Tran Comp off | Transactions Count with Offers | The number of transactions for a user completes with an offer |
| Tran Amnt NO off | Transactions Amount without any Offers | The amount of transactions for a user completes without an offer |
| Tran Amnt off | Transactions Amount with Offers | The amount of transactions for a user completes with an offer |
| Offer Tran Cnt Ratio | Offer Transactions Count Ratio | Transaction count incentivized by offer / Total number of transactions - TC Of/TTC |
| Offer Tran Amnt Ratio | Offer Transactions Amount Ratio | Transaction amount incentivized by offer / Total amount of transactions - TA Of/TTA |
| Offer Comp View Ratio | Offer Completed View Ratio | The number of offer completed / the number of offer viewed |
| Offer Comp Rec Ratio | Offer Completed Receive Ratio | The number of offers completed/ the number of offers received |
| Tran Amnt per Offer | Transactions Amount per Offer | The total transaction amount with offer/ Total number of transactions with an offer |
| Reward per Offer | Reward per Offer | Total rewards received / the number of offer completed |
| BOGO Comp | BOGO Completed | The number of offer completed for BOGO offer after viewing the BOGO offer |
| Disc Comp | Discount Completed | The number of offer completed for Discount offer after viewing the Discount offer |
| Info Comp | Information Completed | The number of offer completed for Information offer after viewing the Information offer |
| BOGO Conv | BOGO offer conversion rate | The number of BOGO offers completed / The number of BOGO offer received - BOGO Comp/BOGO OR |
| Info Conv | Information offer conversion rate | The number of Information offers completed / The number of Information offer received - Info Comp/Info OR |
| Disc Conv | Discount offer conversion rate | The number of Discount offers completed / The number of Discount offer received - Disc Comp/Disc OR |
# Register readable names for the offer-conversion features described in the
# table above (short key -> full feature name).
TempKey = ['Tran Comp NO off','Tran Comp off','Tran Amnt NO off','Tran Amnt off','Offer Tran Cnt Ratio','Offer Tran Amnt Ratio',
'Offer Comp View Ratio','Offer Comp Rec Ratio','Tran Amnt per Offer','Reward per Offer','BOGO Comp','Disc Comp',
'Info Comp','BOGO Conv','Info Conv','Disc Conv']
TempVals = ['Transactions Count without any Offers','Transactions Count with Offers','Transactions Amount without any Offers',
'Transactions Amount with Offers','Offer Transactions Count Ratio','Offer Transactions Amount Ratio',
'Offer Completed View Ratio','Offer Completed Receive Ratio','Transactions Amount per Offer',
'Reward per Offer','BOGO Completed','Discount Completed','Information Completed','BOGO offer conversion rate',
'Information offer conversion rate','Discount offer conversion rate']
Temp = dict(zip(TempKey,TempVals))
Feat_Dict.update(Temp)
del Temp, TempKey, TempVals
Given that there is no explicit offer conversion for the parameters just introduced, we need to use nested loops based on the number of offer-completed transactions and the transaction amounts from the offer conversion.
We can create a function, User_Record that takes a user_id and gives all user records as a DataFrame. Then, this function is used to investigate the data based on assumptions that were made at the beginning of this section.
User_Record = lambda user_id: Data[Data['Person'] == user_id].reset_index(drop=True)
For example, we can try
User_Record('68be06ca386d4c31939f3a4f0e3dd783')
| Person | Event | Time | Amount | Reward Received | Offer ID | Reward Defined | Difficulty | Duration | Offer Type | Mobile | Social | Web | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Received | 168 | NaN | NaN | 2906b810c7d4411798c6938adc9daaa5 | 2.0 | 10.0 | 7.0 | Discount | 1.0 | 1.0 | 0.0 | 1.0 |
| 1 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Viewed | 216 | NaN | NaN | 2906b810c7d4411798c6938adc9daaa5 | 2.0 | 10.0 | 7.0 | Discount | 1.0 | 1.0 | 0.0 | 1.0 |
| 2 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Received | 336 | NaN | NaN | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 5.0 | 20.0 | 10.0 | Discount | 1.0 | 0.0 | 0.0 | 1.0 |
| 3 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Viewed | 348 | NaN | NaN | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 5.0 | 20.0 | 10.0 | Discount | 1.0 | 0.0 | 0.0 | 1.0 |
| 4 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 360 | 0.35 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Received | 408 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 6 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Viewed | 408 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 7 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 414 | 0.74 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 444 | 1.89 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Received | 504 | NaN | NaN | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 3.0 | 7.0 | 7.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 10 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Viewed | 504 | NaN | NaN | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 3.0 | 7.0 | 7.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 11 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 510 | 5.04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 534 | 0.38 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 13 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 552 | 2.12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 14 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Completed | 552 | NaN | 2.0 | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 15 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Completed | 552 | NaN | 3.0 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 3.0 | 7.0 | 7.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 16 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Received | 576 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 17 | 68be06ca386d4c31939f3a4f0e3dd783 | Offer Viewed | 582 | NaN | NaN | fafdcd668e3743c1bb461111dcafc2a4 | 2.0 | 10.0 | 10.0 | Discount | 1.0 | 1.0 | 1.0 | 1.0 |
| 18 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 606 | 4.61 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 19 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 630 | 0.06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 20 | 68be06ca386d4c31939f3a4f0e3dd783 | Transaction | 696 | 5.21 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
and get all records of '68be06ca386d4c31939f3a4f0e3dd783'
Note: This might take some time to process!
# 'Offer ID's for only the 'BOGO' and 'Discount' offer types (these require an
# explicit 'Offer Completed' event; informational offers do not).
Bogo_Discount_Offer_IDS = Portfolio[Portfolio['Offer Type'].isin(['BOGO', 'Discount'])]['Offer ID'].values

def _collect_offer_transactions():
    """Scan every user's event log and aggregate offer-driven activity.

    For each received offer we look for a matching 'Offer Viewed' event inside
    the offer's validity window.  A viewed BOGO/Discount offer counts as
    completed only when an 'Offer Completed' event occurs after the view and
    inside the window; a viewed Informational offer counts as completed when
    at least one transaction occurs after the view and inside the window.

    Returns
    -------
    tuple(defaultdict, defaultdict)
        ``person_off_trans``: user_id -> {metric name -> float} counters
        ('Offer Viewed', 'Offer Completed', 'Offer Trans Amnt',
        'Offer Difficulty' and per-offer-type completion counts).
        ``person_effective_offer``: user_id -> list of offer ids that
        effectively drove a purchase.

    Note: wrapping the scan in a function makes all loop temporaries local, so
    the fragile ``del`` cleanup of possibly-unbound names in the original code
    is no longer needed.
    """
    person_off_trans = defaultdict(lambda: defaultdict(float))
    person_effective_offer = defaultdict(list)
    progress = progressbar.ProgressBar(maxval=User_Data.shape[0],
                                       widgets=[progressbar.Bar('*', '[', ']'),
                                                progressbar.Percentage()])
    progress.start()
    # Outer loop: every user in User_Data.
    for counter, user_id in enumerate(User_Data.index, start=1):
        progress.update(counter)
        records = User_Record(user_id)
        received_idx = records[records['Event'] == 'Offer Received'].index.values
        transactions = records[records['Event'] == 'Transaction']
        # Inner loop: every received offer for this user.
        for r_ind in received_idx:
            offer_id = records.loc[r_ind, 'Offer ID']
            # Validity window: 'Duration' is in days while 'Time' is in hours,
            # hence the *24.  (.iloc[0] avoids the deprecated int(Series).)
            valid_hours = int(Portfolio.loc[Portfolio['Offer ID'] == offer_id,
                                            'Duration'].iloc[0]) * 24
            offer_start = records.loc[r_ind, 'Time']
            offer_end = offer_start + valid_hours
            # Views of this offer inside the validity window.
            viewed = records[(records['Offer ID'] == offer_id) &
                             (records['Event'] == 'Offer Viewed') &
                             (records['Time'] >= offer_start) &
                             (records['Time'] <= offer_end)]
            if viewed.empty:
                continue
            # BUGFIX: the original code incremented 'Offer Viewed' a second
            # time inside the informational branch, double-counting views of
            # informational offers.  Count each qualifying view exactly once.
            person_off_trans[user_id]['Offer Viewed'] += 1
            if offer_id in Bogo_Discount_Offer_IDS:
                # Was the viewed BOGO/Discount offer completed in the window?
                completed = records[(records['Offer ID'] == offer_id) &
                                    (records['Event'] == 'Offer Completed') &
                                    (records['Time'] >= offer_start) &
                                    (records['Time'] <= offer_end)]
                if completed.empty:
                    continue
                view_ind = viewed.index[0]
                comp_ind = completed.index[0]
                # The completion must come after the view for the offer to
                # count as effective (index order == event order here).
                if view_ind < comp_ind:
                    person_off_trans[user_id]['Offer Completed'] += 1
                    person_off_trans[user_id][completed['Offer Type'].values[0]] += 1
                    # Amount spent between the view and the completion.
                    person_off_trans[user_id]['Offer Trans Amnt'] += \
                        transactions.loc[view_ind:comp_ind]['Amount'].sum()
                    person_off_trans[user_id]['Offer Difficulty'] += \
                        completed['Difficulty'].values[0]
                    if offer_id not in person_effective_offer[user_id]:
                        person_effective_offer[user_id].append(offer_id)
            else:
                # Informational offer: completed when any transaction happens
                # between the view and the end of the validity window.
                view_time = viewed.iloc[0]['Time']
                info_trans = records[(records['Event'] == 'Transaction') &
                                     (records['Time'] >= view_time) &
                                     (records['Time'] <= offer_end)]
                if not info_trans.empty:
                    person_off_trans[user_id]['Offer Completed'] += 1
                    person_off_trans[user_id]['Offer Trans Amnt'] += info_trans['Amount'].sum()
                    person_off_trans[user_id]['Informational'] += 1
                    if offer_id not in person_effective_offer[user_id]:
                        person_effective_offer[user_id].append(offer_id)
    # End of the searching process.
    progress.finish()
    return person_off_trans, person_effective_offer

Person_Off_Trans, Person_Effective_Offer = _collect_offer_transactions()
[*************************************************************************]100%
Once the searching process has been completed, we can convert the output to a data frame.
# Register readable names for the per-type offer-completion counters.
Feat_Dict.update({'BOGO comp': 'BOGO Offer Completed',
                  'Disc comp': 'Discount Offer Completed',
                  'Info comp': 'Informational Offer Completed'})
# Turn the search results into a DataFrame (one row per user) and shorten the
# offer-type column names ('comp' stands for completed).
Person_Off_Trans_df = (
    pd.DataFrame.from_dict(Person_Off_Trans, orient='index')
      .rename(columns={'BOGO': 'BOGO comp',
                       'Discount': 'Disc comp',
                       'Informational': 'Info comp'})
)
Person_Off_Trans_df.head()
| Offer Viewed | Offer Completed | Disc comp | Offer Trans Amnt | Offer Difficulty | BOGO comp | Info comp | |
|---|---|---|---|---|---|---|---|
| 68be06ca386d4c31939f3a4f0e3dd783 | 5.0 | 2.0 | 2.0 | 17.71 | 17.0 | NaN | NaN |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 78afa995795e4d85b5d9ceeca43f5fef | 5.0 | 3.0 | NaN | 91.00 | 15.0 | 2.0 | 1.0 |
| a03223e636434f42ac4c3df47e8bac43 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| e2127556f4f64592b11af22de27a7932 | 3.0 | 2.0 | 1.0 | 36.84 | 15.0 | 1.0 | NaN |
Now, \begin{align} \mbox{BOGO Offer} &= \begin{cases} 0 &\mbox{if BOGO_comp is NaN} \\1 & \mbox{else}\end{cases}\\ \mbox{Disc offer} &= \begin{cases} 0 &\mbox{if Disc_comp is NaN} \\1 & \mbox{else}\end{cases}\\ \mbox{Info offer} &= \begin{cases} 0 &\mbox{if Info_comp is NaN} \\1 & \mbox{else}\end{cases} \end{align}
Feat_Dict.update({'BOGO offer': 'BOGO Offers', 'Disc offer': 'Discount Offers',
                  'Info offer': 'Informational Offers'})
# Binary indicators per offer type: 1 when the user completed at least one
# offer of that type (i.e. the completion count is not NaN), else 0.
# Vectorized notna() replaces the original per-element apply/np.isnan lambda —
# same 0/1 result, one C-level pass instead of a Python loop per row.
for comp_col, flag_col in [('BOGO comp', 'BOGO offer'),
                           ('Disc comp', 'Disc offer'),
                           ('Info comp', 'Info offer')]:
    Person_Off_Trans_df[flag_col] = Person_Off_Trans_df[comp_col].notna().astype(int)
Person_Off_Trans_df.head()
| Offer Viewed | Offer Completed | Disc comp | Offer Trans Amnt | Offer Difficulty | BOGO comp | Info comp | BOGO offer | Disc offer | Info offer | |
|---|---|---|---|---|---|---|---|---|---|---|
| 68be06ca386d4c31939f3a4f0e3dd783 | 5.0 | 2.0 | 2.0 | 17.71 | 17.0 | NaN | NaN | 0 | 1 | 0 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 5.0 | 3.0 | NaN | 91.00 | 15.0 | 2.0 | 1.0 | 1 | 0 | 1 |
| a03223e636434f42ac4c3df47e8bac43 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 |
| e2127556f4f64592b11af22de27a7932 | 3.0 | 2.0 | 1.0 | 36.84 | 15.0 | 1.0 | NaN | 1 | 1 | 0 |
Merging the current dataframe, Person_Off_Trans_df, with the User_Data dataframe.
# Attach the per-user offer metrics to the main user table
# (left join on the shared user-id index).
User_Data = User_Data.join(Person_Off_Trans_df)
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | Tot Rewards Rec | Ave Tran Amnt | Offer Viewed | Offer Completed | Disc comp | Offer Trans Amnt | Offer Difficulty | BOGO comp | Info comp | BOGO offer | Disc offer | Info offer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||
| 68be06ca386d4c31939f3a4f0e3dd783 | NaN | 5.0 | NaN | 9.0 | 20.40 | 5.0 | 2.267 | 5.0 | 2.0 | 2.0 | 17.71 | 17.0 | NaN | NaN | 0.0 | 1.0 | 0.0 |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 | 3.0 | 77.01 | 5.0 | 25.670 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 | 6.0 | 14.30 | 0.0 | 2.383 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | NaN | 1.0 | 7.0 | 159.27 | 20.0 | 22.753 | 5.0 | 3.0 | NaN | 91.00 | 15.0 | 2.0 | 1.0 | 1.0 | 0.0 | 1.0 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 | 3.0 | 4.65 | 0.0 | 1.550 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 |
Creating a column No Offer such that
$$\text{No Offer}= \begin{cases} 1, & \mbox{if there is no offer} \\ 0, & \mbox{if there is at least one offer (BOGO, Discount, Informational)}. \end{cases} $$
User_Data['No Offer'] = np.where(User_Data['Offer Completed'].isnull(), 1, 0)
For example, we can see the customer that had no offers:
User_Data.loc[User_Data['No Offer']==1,:].head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | Tot Rewards Rec | Ave Tran Amnt | Offer Viewed | Offer Completed | Disc comp | Offer Trans Amnt | Offer Difficulty | BOGO comp | Info comp | BOGO offer | Disc offer | Info offer | No Offer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | ||||||||||||||||||
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | NaN | 1.0 | 3.0 | 77.01 | 5.0 | 25.670 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | NaN | 1.0 | 6.0 | 14.30 | 0.0 | 2.383 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 1 |
| a03223e636434f42ac4c3df47e8bac43 | NaN | 3.0 | 2.0 | 3.0 | 4.65 | 0.0 | 1.550 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 1 |
| 8ec6ce2a7e7949b1bf142def7d0e0586 | 1.0 | 3.0 | 1.0 | NaN | NaN | 0.0 | NaN | 6.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 1 |
| 68617ca6246f4fbc85e91a2a49552598 | 3.0 | 2.0 | NaN | 2.0 | 0.24 | 0.0 | 0.120 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 1 |
# Register readable names for the ratio / per-offer features created below.
Feat_Dict.update({
    'Offer Tran Cnt Ratio': 'Offer Transaction Count Ratio',
    'Offer Trans Amnt': 'Total Transaction Amount',
    'Offer Trans Amnt Ratio': 'Offer Transaction Amount Ratio',
    'Offer Comp View Ratio': 'Viewed Offer Completed Ratio',
    'Tran Amnt per Offer': 'Transaction Amount per Offer',
    'Reward per Offer': 'Reward per offer',
    'Difficulty per Offer': 'Difficulty per Offer',
})
# Share of all transactions that were driven by offers.
User_Data['Offer Tran Cnt Ratio'] = (User_Data['Offer Completed']
                                     / User_Data['Tot Tran Cnt']).round(2)
# Cap the offer-driven amount at the user's total spend — it cannot exceed it.
User_Data['Offer Trans Amnt'] = np.where(
    User_Data['Offer Trans Amnt'] > User_Data['Tot Tran Amnt'],
    User_Data['Tot Tran Amnt'],
    User_Data['Offer Trans Amnt'])
# Share of total spend that was offer-driven.
User_Data['Offer Trans Amnt Ratio'] = (User_Data['Offer Trans Amnt']
                                       / User_Data['Tot Tran Amnt']).round(2)
# Completed-to-viewed offer ratio.
User_Data['Offer Comp View Ratio'] = User_Data['Offer Completed'] / User_Data['Offer Viewed']
# Completed-to-received offer ratio (received = sum over all three offer types).
User_Data['Offer Comp Rec Ratio'] = (
    User_Data['Offer Completed']
    / User_Data[['BOGO Offer Rec', 'Disc Offer Rec', 'Info Offer Rec']].sum(axis=1))
# Average spend, reward, and difficulty per completed offer.
User_Data['Tran Amnt per Offer'] = (User_Data['Offer Trans Amnt']
                                    / User_Data['Offer Completed']).round(2)
User_Data['Reward per Offer'] = (User_Data['Tot Rewards Rec']
                                 / User_Data['Offer Completed']).round(2)
User_Data['Difficulty per Offer'] = (User_Data['Offer Difficulty']
                                     / User_Data['Offer Completed']).round(2)
In some cases, customers receive rewards without completing any offers. To correct those values, we set the Total Rewards Received and Reward per Offer values to zero wherever Offer Completed is zero.
# Zero out reward-related fields for users with no completed offers, then
# replace every remaining NaN in the table with 0.
No_Comp_Mask = User_Data['Offer Completed'] == 0
User_Data.loc[No_Comp_Mask, 'Tot Rewards Rec'] = 0
User_Data.loc[No_Comp_Mask, 'Reward per Offer'] = 0
del No_Comp_Mask
User_Data.fillna(0, inplace=True)
User_Data.head()
| BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | Tot Rewards Rec | Ave Tran Amnt | Offer Viewed | Offer Completed | Disc comp | ... | Disc offer | Info offer | No Offer | Offer Tran Cnt Ratio | Offer Trans Amnt Ratio | Offer Comp View Ratio | Offer Comp Rec Ratio | Tran Amnt per Offer | Reward per Offer | Difficulty per Offer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 68be06ca386d4c31939f3a4f0e3dd783 | 0.0 | 5.0 | 0.0 | 9.0 | 20.40 | 5.0 | 2.267 | 5.0 | 2.0 | 2.0 | ... | 1.0 | 0.0 | 0 | 0.22 | 0.87 | 0.4 | 0.40 | 8.86 | 2.50 | 8.5 |
| 0610b486422d4921ae7d2bf64640c50b | 1.0 | 0.0 | 1.0 | 3.0 | 77.01 | 5.0 | 25.670 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 |
| 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | 0.0 | 1.0 | 6.0 | 14.30 | 0.0 | 2.383 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 |
| 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | 0.0 | 1.0 | 7.0 | 159.27 | 20.0 | 22.753 | 5.0 | 3.0 | 0.0 | ... | 0.0 | 1.0 | 0 | 0.43 | 0.57 | 0.6 | 0.75 | 30.33 | 6.67 | 5.0 |
| a03223e636434f42ac4c3df47e8bac43 | 0.0 | 3.0 | 2.0 | 3.0 | 4.65 | 0.0 | 1.550 | 5.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0 |
5 rows × 25 columns
Moreover,
Feat_Dict.update({'Gender Female': 'Gender: Female', 'Gender Male': 'Gender: Male',
                  'Gender Other': 'Gender: Other'})
# One-hot encode gender, index the demographics by user ID, and drop the
# columns that are no longer needed before joining onto User_Data.
Demographics = (pd.get_dummies(Profile, columns=['Gender'])
                  .set_index('ID')
                  .drop(['Became Member On', 'Member Since Year'], axis=1))
# Dummy columns come out as 'Gender_Female' etc.; use spaces for consistency.
Demographics.columns = [col.replace('_', ' ') for col in Demographics.columns]
User_Data = User_Data.join(Demographics)
del Demographics
# Move the user ID out of the index into a regular column.
User_Data = User_Data.reset_index(drop=False)
User_Data.head()
| ID | BOGO Offer Rec | Disc Offer Rec | Info Offer Rec | Tot Tran Cnt | Tot Tran Amnt | Tot Rewards Rec | Ave Tran Amnt | Offer Viewed | Offer Completed | ... | Offer Comp Rec Ratio | Tran Amnt per Offer | Reward per Offer | Difficulty per Offer | Age | Income | Member Tenure | Gender Female | Gender Male | Gender Other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68be06ca386d4c31939f3a4f0e3dd783 | 0.0 | 5.0 | 0.0 | 9.0 | 20.40 | 5.0 | 2.267 | 5.0 | 2.0 | ... | 0.40 | 8.86 | 2.50 | 8.5 | 55 | 64000 | 23 | 0 | 0 | 1 |
| 1 | 0610b486422d4921ae7d2bf64640c50b | 1.0 | 0.0 | 1.0 | 3.0 | 77.01 | 5.0 | 25.670 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 55 | 112000 | 18 | 1 | 0 | 0 |
| 2 | 38fe809add3b4fcf9315a9694bb96ff5 | 1.0 | 0.0 | 1.0 | 6.0 | 14.30 | 0.0 | 2.383 | 1.0 | 0.0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 55 | 64000 | 6 | 0 | 0 | 1 |
| 3 | 78afa995795e4d85b5d9ceeca43f5fef | 3.0 | 0.0 | 1.0 | 7.0 | 159.27 | 20.0 | 22.753 | 5.0 | 3.0 | ... | 0.75 | 30.33 | 6.67 | 5.0 | 75 | 100000 | 20 | 1 | 0 | 0 |
| 4 | a03223e636434f42ac4c3df47e8bac43 | 0.0 | 3.0 | 2.0 | 3.0 | 4.65 | 0.0 | 1.550 | 5.0 | 0.0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 55 | 64000 | 17 | 0 | 0 | 1 |
5 rows × 32 columns
# NOTE(review): this loop builds Temp = {column -> column} for every column of
# Data and User_Data that has no entry in Feat_Dict, but Temp is deleted on the
# last line without ever being used.  It looks like a leftover audit step (or a
# missing `Feat_Dict.update(Temp)` before the `del`) — confirm the intent
# before removing or completing it.
Temp = {}
for c in Data.columns:
    if not c in Feat_Dict.keys():
        Temp[c] = c
for c in User_Data.columns:
    if not c in Feat_Dict.keys():
        Temp[c] = c
del Temp, c
# Persist the engineered user features and the feature-name dictionary.
# os.path.join builds the paths portably: the original hand-glued Windows
# separators inconsistently ('\\User_Data.csv' vs '\Feat_Dict.pkl'), which is
# fragile (a filename starting with e.g. 't' would turn '\t' into a tab) and
# breaks on non-Windows systems.
User_Data.to_csv(os.path.join(Path, 'User_Data.csv'), index=None, header=True)
with open(os.path.join(Path, 'Feat_Dict.pkl'), 'wb') as fp:
    pickle.dump(Feat_Dict, fp, protocol=pickle.HIGHEST_PROTOCOL)